Figure 5:Diversification is most dramatic in exposed regions of the VSG

Assess genomic coverage of various Tbg isolates to VSG expressed in patients. Run bowtie using a variety of read lengths from sequenced T.b. gambiense genomes and calculate/compare coverage of assembled patient VSG from our study.





Console commands used to run analysis

packages used:

  • Trim_Galore (version 0.5.0)

  • Trimmomatic (version 0.38)

  • Bowtie with the parameters -v 2 -a -S (version 1.1.1)

  • Bedtools (version 2.27.0)

  • samtools


# raw sequencing read files each uploaded to their own directory designated by strain name
# datasets provided as paired end reads concatenated into single fq file and analyzed as if single end
# we can iterate through each genome using their distinctive directory names with this bash script

for path in seqFiles/*; do
        SAMPLEID="$(basename "${path}")"
# quality and adapter trim raw reads
trim_galore --dont_gzip -o seqFiles/$SAMPLEID/ seqFiles/$SAMPLEID/*.fq

# truncate reads to desired lengths
trimmomatic SE -threads 4 seqFiles/$SAMPLEID/*_trimmed.fq seqFiles/$SAMPLEID/20BP.fq CROP:20
trimmomatic SE -threads 4 seqFiles/$SAMPLEID/*_trimmed.fq seqFiles/$SAMPLEID/30BP.fq CROP:30
trimmomatic SE -threads 4 seqFiles/$SAMPLEID/*_trimmed.fq seqFiles/$SAMPLEID/50BP.fq CROP:50

# generated an indexed bowtie reference for Tbg Patient VSG called tbgPatientsbt
# run bowtie for each read query size
(bowtie -a --best -v 2 -S tbgPatientsbt seqFiles/$SAMPLEID/20BP.fq seqFiles/$SAMPLEID/20BP.bt.sam) 2> seqFiles/$SAMPLEID/20BPbowtieSE.txt
samtools view -bS seqFiles/$SAMPLEID/20BP.bt.sam -o seqFiles/$SAMPLEID/20BP.bt.bam
bedtools genomecov -ibam seqFiles/$SAMPLEID/20BP.bt.bam -bga > seqFiles/$SAMPLEID/genomecov.20BP.txt

(bowtie -a --best -v 2 -S tbgPatientsbt seqFiles/$SAMPLEID/30BP.fq seqFiles/$SAMPLEID/30BP.bt.sam) 2> seqFiles/$SAMPLEID/30BPbowtieSE.txt
samtools view -bS seqFiles/$SAMPLEID/30BP.bt.sam -o seqFiles/$SAMPLEID/30BP.bt.bam
bedtools genomecov -ibam seqFiles/$SAMPLEID/30BP.bt.bam -bga > seqFiles/$SAMPLEID/genomecov.30BP.txt

(bowtie -a --best -v 2 -S tbgPatientsbt seqFiles/$SAMPLEID/50BP.fq seqFiles/$SAMPLEID/50BP.bt.sam) 2> seqFseqFiles/$SAMPLEID/50BPbowtieSE.txt
samtools view -bS seqFiles/$SAMPLEID/50BP.bt.sam -o seqFiles/$SAMPLEID/50BP.bt.bam
bedtools genomecov -ibam seqFiles/$SAMPLEID/50BP.bt.bam -bga > seqFiles/$SAMPLEID/genomecov.50BP.txt

done

Functions for importing bedtools genomecov data

Use iRanges R package to collapse overlapping reads into single range of VSG gene coverage


Upload Data

Available in FigureData directory as compressed .zip files

Analyze: calculate percent coverage

# calculate percent coverage
calc_cov <- function(df){
  pct_cov <- df %>% group_by(genome, new_name) %>% 
    summarize(aligned = sum(width)) %>% 
    full_join(df %>% select(new_name, seqlen) %>% distinct()) %>% 
    mutate(pct_coverage = (aligned / seqlen)*100)
  return(pct_cov)
}

# merge with all the VSG with 0 hits

calc_cov_20bp <- calc_cov(bt20bp)
## `summarise()` has grouped output by 'genome'. You can override using the
## `.groups` argument.
## Joining, by = "new_name"
# no VSG had 0 coverage with 20bp reads
calc_cov_30bp <- calc_cov(bt30bp)
## `summarise()` has grouped output by 'genome'. You can override using the
## `.groups` argument.
## Joining, by = "new_name"
calc_cov_30bp <- rbind(calc_cov_30bp, nohits30bp %>% 
                         mutate(aligned = 0) %>% 
                         mutate(pct_coverage = 0) %>% 
                         select(genome, new_name, aligned, seqlen, pct_coverage))

calc_cov_50bp <- calc_cov(bt50bp)
## `summarise()` has grouped output by 'genome'. You can override using the
## `.groups` argument.
## Joining, by = "new_name"
calc_cov_50bp <- rbind(calc_cov_50bp, nohits50bp %>% 
                         mutate(aligned = 0) %>% 
                         mutate(pct_coverage = 0) %>% 
                         select(genome, new_name, aligned, seqlen, pct_coverage))

find patient N-term domain boundaries as determined by HMMscan pipeline

Compare coverage by domain

# focus on 30bp set for more in-depth analysis
Nbound <- read_csv("FigureData/AllPatient_orf_VSGs_merged-translated_NtermSummary.csv", show_col_types = FALSE) %>% select(original_seqID, nterm_seq_length) %>% mutate(nterm_coord = nterm_seq_length*3)
colnames(Nbound)[1] <- "seq_ID"

plot30bp <- merge(bt30bp[grepl("Gambiense", bt30bp$new_name), ], genomeMeta)
plot30bp <- left_join(plot30bp, Nbound, by = 'seq_ID') %>% 
  mutate(groups = case_when(year < "1999" ~ "Cote d'Ivoire 1980's",
                            year > "2004" ~ "DRC 2000's",
                            TRUE ~ "Cote d'Ivoire 2000's"))

# pick up any ranges that span the C-term/N-term boundary
DomSpans <-plot30bp[plot30bp$start <= plot30bp$nterm_coord & plot30bp$end >= plot30bp$nterm_coord, ] %>%
  select(new_name, genome, start, end, nterm_coord, seqlen)

addtoNterm <- DomSpans %>% select(new_name, genome, start, nterm_coord) %>% dplyr::rename(end = nterm_coord) %>% mutate(width = (end - start)+1)

addtoCterm <- DomSpans %>% select(new_name, genome,nterm_coord, end) %>% dplyr::rename(start = nterm_coord) %>% mutate(width = (end - start)+1)

Nterm <- plot30bp[plot30bp$end <= plot30bp$nterm_coord, ] %>% 
  select(new_name, genome, start, end, width) %>%
  rbind(.,addtoNterm) %>%
  distinct() %>%
  group_by(new_name, genome) %>% 
  summarize(aligned = sum(width)) %>% 
  merge(plot30bp %>% select(new_name, nterm_coord) %>% distinct()) %>% 
  mutate(pct_coverage = (aligned / nterm_coord)*100) %>%
  select(new_name, genome, pct_coverage, aligned, nterm_coord) %>%
  dplyr::rename(domain_length = nterm_coord)
## `summarise()` has grouped output by 'new_name'. You can override using the
## `.groups` argument.
# make entries for VSG with no hits
patientVSG <- unique(plot30bp$new_name)
genomes <- unique(bt30bp$genome)

add0Nhit <- data.frame(new_name = character(),
                       genome = character(),
                       pct_coverage = double())
for (i in 1:36){
  y <- data.frame(new_name = setdiff(patientVSG, Nterm[Nterm$genome == genomes[i], ] %>% .$new_name)) %>%
    mutate(genome = genomes[i]) %>%
    mutate(pct_coverage = 0, aligned = 0)
  add0Nhit <- rbind(add0Nhit, y)
}

add0Nhit <- inner_join(add0Nhit, plot30bp %>% select(new_name, nterm_coord)) %>% distinct() %>% dplyr::rename(domain_length = nterm_coord)
## Joining, by = "new_name"
Nterm <- rbind(Nterm, add0Nhit)


Cterm <- plot30bp[plot30bp$start >= plot30bp$nterm_coord, ] %>% 
  select(new_name, genome, start, end, width) %>%
  rbind(.,addtoCterm) %>%
  distinct() %>%
  group_by(new_name, genome) %>% 
  summarize(aligned = sum(width)) %>% 
  merge(plot30bp %>% select(new_name, nterm_coord, seqlen) %>% distinct()) %>%
  mutate(pct_coverage = (aligned / (seqlen - nterm_coord + 1))*100, domain_length = (seqlen - nterm_coord + 1)) %>%
  select(new_name, genome, pct_coverage, aligned, domain_length)
## `summarise()` has grouped output by 'new_name'. You can override using the
## `.groups` argument.
# make entries for VSG with no hits
add0Chit <- data.frame(new_name = character(),
                       genome = character(),
                       pct_coverage = double())
for (i in 1:36){
  y <- data.frame(new_name = setdiff(patientVSG, Cterm[Cterm$genome == genomes[i], ] %>% .$new_name)) %>%
    mutate(genome = genomes[i]) %>%
    mutate(pct_coverage = 0, aligned = 0)
  add0Chit <- rbind(add0Chit, y)
}

add0Chit <- inner_join(add0Chit, plot30bp %>% select(new_name, nterm_coord, seqlen)) %>% mutate(domain_length = seqlen - nterm_coord +1) %>% distinct()
## Joining, by = "new_name"
add0Chit$domain_length <- replace(add0Chit$domain_length, add0Chit$domain_length<0, 0)

Cterm <- rbind(Cterm, add0Chit %>% select(-nterm_coord,-seqlen))

Fig. 5A: plot N-term/C-term domain coverage

Pick a representative set for ease of viewing (141BT, BRAZO, B4_4163P), we can see from full density plots that most datasets tend to follow similar distribution.

Density plot showing the percentage of each of the patient VSG ORF sequence that had at least one whole genome sequencing read (30bp length) align for each of three representative whole genome datasets.

Density plots of coverage over full VSG ORF

dist20bp <- calc_cov_20bp[grepl("Gambiense", calc_cov_20bp$new_name), ] %>% 
  inner_join(genomeMeta) %>% 
  mutate(dist_facets = case_when(year < "1999" ~ "Cote d'Ivoire 1980's",
                                 year > "2004" ~ "DRC 2000's",
                                 TRUE ~ "Cote d'Ivoire 2000's")) %>% 
  mutate(query_size = "20bp")
## Joining, by = "genome"
dist30bp <- calc_cov_30bp[grepl("Gambiense", calc_cov_30bp$new_name), ] %>% 
  inner_join(genomeMeta) %>% 
  mutate(dist_facets = case_when(year < "1999" ~ "Cote d'Ivoire 1980's",
                                 year > "2004" ~ "DRC 2000's",
                                 TRUE ~ "Cote d'Ivoire 2000's")) %>% 
  mutate(query_size = "30bp")
## Joining, by = "genome"
dist50bp <- calc_cov_50bp[grepl("Gambiense", calc_cov_50bp$new_name), ] %>% 
  inner_join(genomeMeta) %>% 
  mutate(dist_facets = case_when(year < "1999" ~ "Cote d'Ivoire 1980's",
                                 year > "2004" ~ "DRC 2000's",
                                 TRUE ~ "Cote d'Ivoire 2000's")) %>% 
  mutate(query_size = "50bp")
## Joining, by = "genome"
g1 <- ggplot(data = rbind(dist20bp[dist20bp$dist_facets == "Cote d'Ivoire 1980's", ], dist30bp[dist30bp$dist_facets == "Cote d'Ivoire 1980's", ], dist50bp[dist50bp$dist_facets == "Cote d'Ivoire 1980's", ]) %>% .[!.$genome == "tbgDAL972", ],
       aes(x = pct_coverage, fill = genome)) +
  geom_density(alpha = 0.4) +
  scale_color_viridis(discrete = T) +
  scale_fill_viridis(discrete = T) +
  xlab("Percent Patient VSG Gene Coverage") +
  ggtitle("Cote d'Ivoire 1980's") +
  facet_wrap(~query_size, ncol = 1, scales = "free_y") +
  scale_x_continuous(limits = c(0,100)) +
  theme_bw() 

g2 <- ggplot(data = rbind(dist20bp[dist20bp$dist_facets == "Cote d'Ivoire 2000's", ], dist30bp[dist30bp$dist_facets == "Cote d'Ivoire 2000's", ], dist50bp[dist50bp$dist_facets == "Cote d'Ivoire 2000's", ]),
       aes(x = pct_coverage, fill = genome)) +
  geom_density(alpha = 0.4) +
  scale_color_viridis(discrete = T) +
  scale_fill_viridis(discrete = T) +
  xlab("Percent Patient VSG Gene Coverage") +
  ggtitle("Cote d'Ivoire 2000's") +
  facet_wrap(~query_size, ncol = 1, scales = "free_y") +
  scale_x_continuous(limits = c(0,100)) +
  theme_bw() 

g3sum <- rbind(dist20bp[dist20bp$dist_facets == "DRC 2000's", ], dist30bp[dist30bp$dist_facets == "DRC 2000's", ], dist50bp[dist50bp$dist_facets == "DRC 2000's", ]) %>% group_by(query_size, genome) %>% summarize(pct_coverage = mean(pct_coverage))
## `summarise()` has grouped output by 'query_size'. You can override using the
## `.groups` argument.
g3 <- ggplot(data = rbind(dist20bp[dist20bp$dist_facets == "DRC 2000's", ], dist30bp[dist30bp$dist_facets == "DRC 2000's", ], dist50bp[dist50bp$dist_facets == "DRC 2000's", ]),
       aes(x = pct_coverage, fill = genome)) +
#  geom_vline(data = g3sum, aes(xintercept = pct_coverage, color = genome), linetype = "dashed") +
  geom_density(alpha = 0.4) +
#  scale_color_viridis(discrete = T) +
  scale_fill_viridis(discrete = T) +
  xlab("Percent Patient VSG Gene Coverage") +
  ggtitle("DRC 2000's") +
  facet_wrap(~query_size, ncol = 1, scales = "free_y") +
  scale_x_continuous(limits = c(0,100)) +
  theme_bw() 

plot_grid(g1, g2, g3, ncol = 3)

Representatives for Fig. 5A

distDomains <- rbind(Cterm %>% mutate(domain = 'C-Terminal Domain'), Nterm %>% mutate(domain = 'N-Terminal Domain')) %>%
  convert_as_factor(domain, genome) %>%
  merge(plot30bp %>% select(origin, year, genome) %>% distinct()) %>%
  ungroup()

distDomains %>% mutate(groups = case_when(year < "1999" ~ "Cote d'Ivoire 1980's",
                                               year > "2004" ~ "DRC 2000's",
                                               TRUE ~ "Cote d'Ivoire 2000's")) %>%
  .[.$genome == "141BT" | .$genome == "BRAZO" | .$genome == "B4-4163P", ] %>%
  ggplot(aes(x = pct_coverage, fill = groups)) +
  geom_density(alpha = 0.4) +
  scale_color_viridis(discrete = T) +
  scale_fill_viridis(discrete = T) +
  xlab("Percent Patient VSG Gene Coverage") +
  ggtitle("Domain Coverage Distribution") +
  facet_wrap(~domain, ncol = 1, scales = "free_y") +
  scale_x_continuous(limits = c(0,100)) +
  theme_bw() 

Fig. 5B: dotplots of VSG representation within field isolate genomes

Plots comparing sequence representation within the patient VSG N-terminal and C-terminal domains for each group. Representation for each VSG is quantified as the proportion of nucleotides in each domain with at least one alignment to the total number of nucleotides in that domain, with the average representation of all VSGs for each genome shown. Significant differences between groups were determined using Kruskal-Wallis followed by a post-hoc Dunn’s test

# an absolute quantification of representation by each genome by number of bases represented

Represented <- rbind(Cterm %>% mutate(domain = "C-term"), Nterm %>% mutate(domain = "N-term")) %>% group_by(domain, genome) %>% summarise(totals = sum(domain_length), aligned  = sum(aligned)) %>% mutate(proportion = aligned / totals)
## `summarise()` has grouped output by 'domain'. You can override using the
## `.groups` argument.
Represented <- merge(Represented, genomeMeta) %>% 
  mutate(groups = case_when(year < "1999" ~ "Cote d'Ivoire 1980's",
                            year > "2004" ~ "DRC 2000's",
                            TRUE ~ "Cote d'Ivoire 2000's"))

Rep_summary <- Represented %>% group_by(domain, groups) %>% summarise(stddev = sd(proportion), proportion = mean(proportion))
## `summarise()` has grouped output by 'domain'. You can override using the
## `.groups` argument.
Nplot <- ggplot(Represented[Represented$domain == "N-term", ], aes(x = groups, y = proportion, fill = groups)) + geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0.8, position = position_dodge(1)) +
  theme_bw() +
  scale_fill_viridis(discrete = T) + 
  ggtitle("N-terminal Domain Representation") +
  ylab("Nucleotides Aligned / Total VSG Nucleotides") +
  xlab("") + 
  theme(legend.position = "none") + 
  geom_crossbar(data = Rep_summary[Rep_summary$domain == "N-term", ], 
                aes(ymin = proportion, ymax = proportion), size = 0.3, color = "black") + 
  geom_errorbar(data = Rep_summary[Rep_summary$domain == "N-term", ], 
                aes(ymin = (proportion - stddev), ymax = (proportion + stddev)), width = 0.3, color = "black") + scale_y_continuous(limits = c(0,0.2))


Cplot <- ggplot(Represented[Represented$domain == "C-term", ], aes(x = groups, y = proportion, fill = groups)) + geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0.8, position = position_dodge(1)) +
  theme_bw() +
  scale_fill_viridis(discrete = T) + 
  ggtitle("C-terminal Domain Representation") +
  ylab("Nucleotides Aligned / Total VSG Nucleotides") +
  xlab("") + 
  theme(legend.position = "none") + 
  geom_crossbar(data = Rep_summary[Rep_summary$domain == "C-term", ], 
                aes(ymin = proportion, ymax = proportion), size = 0.3, color = "black") + 
  geom_errorbar(data = Rep_summary[Rep_summary$domain == "C-term", ], 
                aes(ymin = (proportion - stddev), ymax = (proportion + stddev)), width = 0.3, color = "black") + scale_y_continuous(limits = c(0.5,0.85))

plot_grid(Nplot, Cplot, ncol = 2)
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

Statistics

  • kruskal wallis test: nonparametric, independent groups, no repeated measures
  • post hoc dunn test: pairwise, determine which groups are significantly different from each other

N-terms

kruskal_test(Represented[Represented$domain == "N-term", ], proportion ~ groups)
## # A tibble: 1 × 6
##   .y.            n statistic    df        p method        
## * <chr>      <int>     <dbl> <int>    <dbl> <chr>         
## 1 proportion    36      17.6     2 0.000151 Kruskal-Wallis
dunn_test(Represented[Represented$domain == "N-term", ], proportion ~ groups, p.adjust.method = "holm")
## # A tibble: 3 × 9
##   .y.        group1    group2    n1    n2 statistic       p   p.adj p.adj.signif
## * <chr>      <chr>     <chr>  <int> <int>     <dbl>   <dbl>   <dbl> <chr>       
## 1 proportion Cote d'I… Cote …    12    12    0.0194 9.85e-1 9.85e-1 ns          
## 2 proportion Cote d'I… DRC 2…    12    12    3.64   2.70e-4 8.10e-4 ***         
## 3 proportion Cote d'I… DRC 2…    12    12    3.62   2.91e-4 8.10e-4 ***

C-terms

kruskal_test(Represented[Represented$domain == "C-term", ], proportion ~ groups)
## # A tibble: 1 × 6
##   .y.            n statistic    df        p method        
## * <chr>      <int>     <dbl> <int>    <dbl> <chr>         
## 1 proportion    36      15.2     2 0.000498 Kruskal-Wallis
dunn_test(Represented[Represented$domain == "C-term", ], proportion ~ groups, p.adjust.method = "holm")
## # A tibble: 3 × 9
##   .y.        group1    group2    n1    n2 statistic       p   p.adj p.adj.signif
## * <chr>      <chr>     <chr>  <int> <int>     <dbl>   <dbl>   <dbl> <chr>       
## 1 proportion Cote d'I… Cote …    12    12    -0.213 8.31e-1 0.831   ns          
## 2 proportion Cote d'I… DRC 2…    12    12     3.27  1.09e-3 0.00218 **          
## 3 proportion Cote d'I… DRC 2…    12    12     3.48  5.04e-4 0.00151 **

Fig5C: Determine where sequences of homology fall on phyre2 predicted VSG structures

Make a map of coverage across each VSG ORF. Regions with at least one alignment from any of the 36 genomic datasets are shown in gray.

#read phyre2 output metadata file
##these are the top hits
phyre2 <- read_delim(file = "FigureData/Phyre2_summaryinfo.txt", delim = "|", show_col_types = FALSE)
colnames(phyre2)[1] <- "seq_ID"
# remove whitespace
phyre2$seq_ID <- gsub(" ", "", phyre2$seq_ID)
# join by cluster ref
phyre2 <- inner_join(phyre2, cluster_ref, by = "seq_ID")

# find regions of no homology, 0 alignments by any genome
noSimilarity <- data.frame(start = double(),
                     end = double(),
                     width = double(),
                     seq_ID = character())
for (i in 1:length(unique(plot30bp$seq_ID))) {
    irR <- IRanges(plot30bp %>%
                   .[.$seq_ID == unique(plot30bp$seq_ID)[i], ] %>%
                   select(seq_ID, start, end) %>%
                   distinct() %>%
                   .$start,
                 end = plot30bp %>%
                   .[.$seq_ID == unique(plot30bp$seq_ID)[i], ] %>%
                   select(seq_ID, start, end) %>%
                   distinct() %>%
                   .$end)
    # collapse overlapping regions to calculate VSG coverage
    cov <- IRanges::reduce(irR) %>% as.data.frame()
    cov <- cov %>% mutate(seq_ID = unique(plot30bp$seq_ID)[i])
    noSimilarity <- rbind(noSimilarity, cov)
}

noSimilarity <- inner_join(noSimilarity, phyre2, by = "seq_ID") %>% 
    merge(plot30bp %>% select(new_name, seqlen, nterm_coord) %>% distinct())

Plot regions of no similarity for all VSG

noSimilarity %>% ggplot() +
  geom_rect(aes(xmin = start, xmax = end,
                       ymin = 0, ymax = 1)) +
  scale_fill_viridis(discrete = T) +
  facet_wrap(~new_name) +
  xlab("Position (bp)") +
  ggtitle("30bp query") +
  theme_bw() +
  geom_vline(aes(xintercept = seqlen), linetype = "solid", color = "blue", size = 0.6) +
  geom_vline(aes(xintercept = nterm_coord), linetype = "solid", color = "red", size = 0.6)

Gambiense 248: type B with highest overall coverage

noSimilarity[noSimilarity$new_name == "Gambiense 248", ] %>% ggplot() +
  geom_rect(aes(xmin = start, xmax = end,
                       ymin = 0, ymax = 1)) +
  facet_wrap(~new_name, nrow = 1) +
  xlab("Position (bp)") +
  theme_bw() +
  geom_vline(aes(xintercept = seqlen), linetype = "solid", color = "royalblue", size = 0.8) +
  geom_vline(aes(xintercept = nterm_coord), linetype = "solid", color = "palegreen", size = 0.8)

Gambiense 400: type A with highest overall coverage

noSimilarity[noSimilarity$new_name == "Gambiense 400", ] %>% ggplot() +
  geom_rect(aes(xmin = start, xmax = end,
                       ymin = 0, ymax = 1)) +
  facet_wrap(~new_name, nrow = 1) +
  xlab("Position (bp)") +
  theme_bw() +
  geom_vline(aes(xintercept = seqlen), linetype = "solid", color = "royalblue", size = 0.8) +
  geom_vline(aes(xintercept = nterm_coord), linetype = "solid", color = "palegreen", size = 0.8) + scale_x_continuous(limits = c(0,1500))

Gambiense 452: type A VSG of around average coverage

noSimilarity[noSimilarity$new_name == "Gambiense 452", ] %>% ggplot() +
  geom_rect(aes(xmin = start, xmax = end,
                       ymin = 0, ymax = 1)) +
  facet_wrap(~new_name, nrow = 1) +
  xlab("Position (bp)") +
  theme_bw() +
  geom_vline(aes(xintercept = seqlen), linetype = "solid", color = "royalblue", size = 0.8) +
  geom_vline(aes(xintercept = nterm_coord), linetype = "solid", color = "palegreen", size = 0.8) +
  scale_x_continuous(limits = c(0,1500))

Table with percent coverage for each VSG by all datasets

Similarity_cov <- noSimilarity %>% group_by(new_name) %>% 
    summarize(aligned = sum(width)) %>% 
    full_join(bt30bp %>% select(new_name, seqlen) %>% distinct()) %>%  
    mutate(pct_coverage = (aligned / seqlen)*100)
## Joining, by = "new_name"
Similarity_cov
## # A tibble: 57 × 4
##    new_name      aligned seqlen pct_coverage
##    <chr>           <int>  <dbl>        <dbl>
##  1 Gambiense 114     768   1491         51.5
##  2 Gambiense 119     342   1491         22.9
##  3 Gambiense 169     246   1473         16.7
##  4 Gambiense 173     258   1473         17.5
##  5 Gambiense 175     316   1473         21.5
##  6 Gambiense 195     363   1464         24.8
##  7 Gambiense 2       813   1665         48.8
##  8 Gambiense 218     559   1458         38.3
##  9 Gambiense 234     538   1455         37.0
## 10 Gambiense 238     349   1455         24.0
## # … with 47 more rows

Supplemental Figure 8

plot20bp <- inner_join(bt20bp, genomeMeta, by = "genome") %>% 
  mutate(groups = case_when(year < "1999" ~ "Cote d'Ivoire 1980's", 
                            year > "2004" ~ "DRC 2000's",
                            TRUE ~ "Cote d'Ivoire 2000's"))

nohits30bp <- nohits30bp %>% rename("depth" = "width") %>% mutate(end = 0)
supp_plot30bp <- rbind(bt30bp, nohits30bp) %>%
  inner_join(., genomeMeta, by = "genome") %>% 
  mutate(groups = case_when(year < "1999" ~ "Cote d'Ivoire 1980's", 
                            year > "2004" ~ "DRC 2000's",
                            TRUE ~ "Cote d'Ivoire 2000's"))

nohits50bp <- nohits50bp %>% rename("depth" = "width") %>% mutate(end = 0)
plot50bp <- rbind(bt50bp, nohits50bp) %>%
  inner_join(., genomeMeta, by = "genome") %>% 
  mutate(groups = case_when(year < "1999" ~ "Cote d'Ivoire 1980's", 
                            year > "2004" ~ "DRC 2000's",
                            TRUE ~ "Cote d'Ivoire 2000's"))

p1 <- plot20bp[!(grepl("Gambiense", plot20bp$new_name)) & plot20bp$groups == "Cote d'Ivoire 1980's" & !(plot20bp$new_name == "Tbg_LiTat1.5"), ] %>%
  group_by(groups) %>%
  mutate(plotbins = as.factor(genome) %>% as.numeric()) %>% 
  ggplot() +
  geom_rect(aes(xmin = start, xmax = end,
                       ymin = plotbins, ymax = plotbins + 0.9,
                fill = genome), show.legend = T) +
  scale_fill_viridis(discrete = T) +
  facet_wrap(~new_name, scales = "free_x", ncol = 3) +
  xlab("Position (bp)") +
  ggtitle("Cote d'Ivoire 1980's Isolates: 20bp query < 2 mismatches") +
  theme_bw()
p2 <- plot20bp[!(grepl("Gambiense", plot20bp$new_name)) & plot20bp$groups == "Cote d'Ivoire 2000's" & !(plot20bp$new_name == "Tbg_LiTat1.5"), ] %>%
  group_by(groups) %>%
  mutate(plotbins = as.factor(genome) %>% as.numeric()) %>% 
  ggplot() +
  geom_rect(aes(xmin = start, xmax = end,
                       ymin = plotbins, ymax = plotbins + 0.9,
                fill = genome), show.legend = T) +
  scale_fill_viridis(discrete = T) +
  facet_wrap(~new_name, scales = "free_x", ncol = 3) +
  xlab("Position (bp)") +
  ggtitle("Cote d'Ivoire 2000's Isolates: 20bp query < 2 mismatches") +
  theme_bw()
p3 <- plot20bp[!(grepl("Gambiense", plot20bp$new_name)) & plot20bp$groups == "DRC 2000's" & !(plot20bp$new_name == "Tbg_LiTat1.5"), ] %>%
  group_by(groups) %>%
  mutate(plotbins = as.factor(genome) %>% as.numeric()) %>% 
  ggplot() +
  geom_rect(aes(xmin = start, xmax = end,
                       ymin = plotbins, ymax = plotbins + 0.9,
                fill = genome), show.legend = T) +
  scale_fill_viridis(discrete = T) +
  facet_wrap(~new_name, scales = "free_x", ncol = 3) +
  xlab("Position (bp)") +
  ggtitle("DRC 2000's isolates: 20bp query < 2 mismatches") +
  theme_bw()

plot_grid(p1,p2,p3, ncol = 3)

p4 <- supp_plot30bp[!(grepl("Gambiense", supp_plot30bp$new_name)) & supp_plot30bp$groups == "Cote d'Ivoire 1980's" & !(supp_plot30bp$new_name == "Tbg_LiTat1.5"), ] %>%
  group_by(groups) %>%
  mutate(plotbins = as.factor(genome) %>% as.numeric()) %>% 
  ggplot() +
  geom_rect(aes(xmin = start, xmax = end,
                       ymin = plotbins, ymax = plotbins + 0.9,
                fill = genome), show.legend = T) +
  scale_fill_viridis(discrete = T) +
  facet_wrap(~new_name, scales = "free_x", ncol = 3, drop = FALSE) +
  xlab("Position (bp)") +
  ggtitle("Cote d'Ivoire 1980's Isolates: 30bp query < 2 mismatches") +
  theme_bw()

p5 <- supp_plot30bp[!(grepl("Gambiense", supp_plot30bp$new_name)) & supp_plot30bp$groups == "Cote d'Ivoire 2000's" & !(supp_plot30bp$new_name == "Tbg_LiTat1.5"), ] %>%
  group_by(groups) %>%
  mutate(plotbins = as.factor(genome) %>% as.numeric()) %>% 
  ggplot() +
  geom_rect(aes(xmin = start, xmax = end,
                       ymin = plotbins, ymax = plotbins + 0.9,
                fill = genome), show.legend = T) +
  scale_fill_viridis(discrete = T) +
  facet_wrap(~new_name, scales = "free_x", ncol = 3, drop = FALSE) +
  xlab("Position (bp)") +
  ggtitle("Cote d'Ivoire 2000's Isolates: 30bp query < 2 mismatches") +
  theme_bw()

p6 <- supp_plot30bp[!(grepl("Gambiense", supp_plot30bp$new_name)) & supp_plot30bp$groups == "DRC 2000's" & !(supp_plot30bp$new_name == "Tbg_LiTat1.5"), ] %>%
  group_by(groups) %>%
  mutate(plotbins = as.factor(genome) %>% as.numeric()) %>% 
  ggplot() +
  geom_rect(aes(xmin = start, xmax = end,
                       ymin = plotbins, ymax = plotbins + 0.9,
                fill = genome), show.legend = T) +
  scale_fill_viridis(discrete = T) +
  facet_wrap(~new_name, scales = "free_x", ncol = 3, drop = FALSE) +
  xlab("Position (bp)") +
  ggtitle("DRC 2000's Isolates: 30bp query < 2 mismatches") +
  theme_bw()

plot_grid(p4, p5, p6, ncol = 3)

calculate % coverage for the mapping controls

# pull % coverage values for controls
mapCTl20bp <- calc_cov_20bp[!(grepl("Gambiense", calc_cov_20bp$new_name)), ]
mapCTl30bp <- calc_cov_30bp[!(grepl("Gambiense", calc_cov_30bp$new_name)), ]

# calculate average coverage for each control gene
mapCTl20bp %>% group_by(new_name) %>% summarise(mean_cov_20bp = mean(pct_coverage))
## # A tibble: 13 × 2
##    new_name     mean_cov_20bp
##    <chr>                <dbl>
##  1 Caeel_Ric-3           81.6
##  2 Caeel_Sol1            77.1
##  3 Caeel_zyg11           68.7
##  4 Drome_fas3            72.3
##  5 Drome_LBR             60.3
##  6 Drome_Wee1            66.8
##  7 K12_enolase           56.3
##  8 K12_ftsZ              57.8
##  9 K12_TufA              55.8
## 10 Tbg_GAPDH            100. 
## 11 Tbg_LiTat1.3          99.4
## 12 Tbg_LiTat1.5          99.3
## 13 Tbg_TGSGP             99.5
mapCTl20bp[!(grepl("Tbg", mapCTl20bp$new_name)), ] %>% group_by(new_name) %>% summarise(mean_cov_20bp = mean(pct_coverage)) %>% .$mean_cov_20bp %>% mean()
## [1] 66.30604
mapCTl30bp %>% group_by(new_name) %>% summarise(mean_cov_30bp = mean(pct_coverage))
## # A tibble: 13 × 2
##    new_name     mean_cov_30bp
##    <chr>                <dbl>
##  1 Caeel_Ric-3          0.475
##  2 Caeel_Sol1           1.89 
##  3 Caeel_zyg11          0.984
##  4 Drome_fas3           1.34 
##  5 Drome_LBR            0.857
##  6 Drome_Wee1           0.486
##  7 K12_enolase          2.05 
##  8 K12_ftsZ             1.45 
##  9 K12_TufA             3.18 
## 10 Tbg_GAPDH          100.   
## 11 Tbg_LiTat1.3        96.2  
## 12 Tbg_LiTat1.5        98.5  
## 13 Tbg_TGSGP           97.7
mapCTl30bp[!(grepl("Tbg", mapCTl30bp$new_name)), ] %>% group_by(new_name) %>% summarise(mean_cov_30bp = mean(pct_coverage)) %>% .$mean_cov_30bp %>% mean()
## [1] 1.413298

Supplemental Figure 9

Summary of Bowtie alignment hits for each assembled gHAT patient VSG against the genomic sequences.

Base-pair coordinates of each patient VSG is plotted as the X-axis, and each facet designates the patient VSG as well as the full ORF sequence length. Bars color-coded by genome dataset group show alignment length and position within the VSG ORF sequence for genomic sequence fragments of 30bp in length.

# make plot of ranges by genome group. Collapse ranges for all genomes isolated in same year/region

CDI1980 <- plot30bp[plot30bp$year < "1999", ]
CDI2000 <- plot30bp[plot30bp$year > "1999" & plot30bp$year < "2005", ]
DRC2000 <- plot30bp[plot30bp$year > "2004", ]

CDI1980.noSimilarity <- data.frame(start = double(),
                     end = double(),
                     width = double(),
                     seq_ID = character())
for (i in 1:length(unique(CDI1980$seq_ID))) {
    irR <- IRanges(CDI1980 %>%
                   .[.$seq_ID == unique(CDI1980$seq_ID)[i], ] %>%
                   select(seq_ID, start, end) %>%
                   distinct() %>%
                   .$start,
                 end = CDI1980 %>%
                   .[.$seq_ID == unique(CDI1980$seq_ID)[i], ] %>%
                   select(seq_ID, start, end) %>%
                   distinct() %>%
                   .$end)
    # collapse overlapping regions to calculate VSG coverage
    cov <- IRanges::reduce(irR) %>% as.data.frame()
    cov <- cov %>% mutate(seq_ID = unique(CDI1980$seq_ID)[i])
    CDI1980.noSimilarity <- rbind(CDI1980.noSimilarity, cov)
}

CDI2000.noSimilarity <- data.frame(start = double(),
                     end = double(),
                     width = double(),
                     seq_ID = character())
for (i in 1:length(unique(CDI2000$seq_ID))) {
    irR <- IRanges(CDI2000 %>%
                   .[.$seq_ID == unique(CDI2000$seq_ID)[i], ] %>%
                   select(seq_ID, start, end) %>%
                   distinct() %>%
                   .$start,
                 end = CDI2000 %>%
                   .[.$seq_ID == unique(CDI2000$seq_ID)[i], ] %>%
                   select(seq_ID, start, end) %>%
                   distinct() %>%
                   .$end)
    # collapse overlapping regions to calculate VSG coverage
    cov <- IRanges::reduce(irR) %>% as.data.frame()
    cov <- cov %>% mutate(seq_ID = unique(CDI2000$seq_ID)[i])
    CDI2000.noSimilarity <- rbind(CDI2000.noSimilarity, cov)
}

DRC2000.noSimilarity <- data.frame(start = double(),
                     end = double(),
                     width = double(),
                     seq_ID = character())
for (i in 1:length(unique(DRC2000$seq_ID))) {
    irR <- IRanges(DRC2000 %>%
                   .[.$seq_ID == unique(DRC2000$seq_ID)[i], ] %>%
                   select(seq_ID, start, end) %>%
                   distinct() %>%
                   .$start,
                 end = DRC2000 %>%
                   .[.$seq_ID == unique(DRC2000$seq_ID)[i], ] %>%
                   select(seq_ID, start, end) %>%
                   distinct() %>%
                   .$end)
    # collapse overlapping regions to calculate VSG coverage
    cov <- IRanges::reduce(irR) %>% as.data.frame()
    cov <- cov %>% mutate(seq_ID = unique(DRC2000$seq_ID)[i])
    DRC2000.noSimilarity <- rbind(DRC2000.noSimilarity, cov)
}

GroupRanges <- rbind(CDI1980.noSimilarity %>% 
                       mutate(group = "Cote d'Ivoire 1980's"), 
                     CDI2000.noSimilarity %>% 
                       mutate(group = "Cote d'Ivoire 2000's"), 
                     DRC2000.noSimilarity %>% mutate(group = "DRC 2000's")) %>%
  mutate(plotbins = as.factor(.$group) %>% as.numeric()) %>% 
  inner_join(phyre2, by = "seq_ID") %>% 
  merge(plot30bp %>% select(new_name, nterm_coord, seqlen) %>% unique())


GroupRanges %>% ggplot() +
  geom_rect(aes(xmin = start, xmax = end,
                       ymin = plotbins, ymax = plotbins + 0.9,
                fill = group), show.legend = T) +
  scale_fill_viridis(discrete = T) +
  geom_vline(aes(xintercept = seqlen), linetype = "solid", color = "blue", size = 0.6) +
  geom_vline(aes(xintercept = nterm_coord), linetype = "solid", color = "red", size = 0.6) +
  facet_wrap(~new_name) +
  xlab("Position (bp)") +
  ggtitle("Position and Range of Bowtie Alignments: 30bp query < 2 mismatches") +
  theme_bw()






## R version 4.1.1 (2021-08-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] rstatix_0.7.0       cowplot_1.1.1       colourvalues_0.3.7 
##  [4] viridis_0.6.2       viridisLite_0.4.0   IRanges_2.26.0     
##  [7] S4Vectors_0.30.2    BiocGenerics_0.38.0 forcats_0.5.1      
## [10] stringr_1.4.0       dplyr_1.0.9         purrr_0.3.4        
## [13] readr_2.1.2         tidyr_1.2.0         tibble_3.1.7       
## [16] ggplot2_3.3.6       tidyverse_1.3.1     reshape2_1.4.4     
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.3       sass_0.4.1       bit64_4.0.5      vroom_1.5.7     
##  [5] jsonlite_1.8.0   carData_3.0-5    modelr_0.1.8     bslib_0.3.1     
##  [9] assertthat_0.2.1 highr_0.9        cellranger_1.1.0 yaml_2.3.5      
## [13] pillar_1.7.0     backports_1.4.1  glue_1.6.2       digest_0.6.29   
## [17] rvest_1.0.2      colorspace_2.0-3 htmltools_0.5.2  plyr_1.8.7      
## [21] pkgconfig_2.0.3  broom_0.8.0      haven_2.5.0      scales_1.2.0    
## [25] tzdb_0.3.0       farver_2.1.0     generics_0.1.2   car_3.1-0       
## [29] ellipsis_0.3.2   withr_2.5.0      cli_3.3.0        magrittr_2.0.3  
## [33] crayon_1.5.1     readxl_1.4.0     evaluate_0.15    fs_1.5.2        
## [37] fansi_1.0.3      xml2_1.3.3       tools_4.1.1      hms_1.1.1       
## [41] lifecycle_1.0.1  munsell_0.5.0    reprex_2.0.1     compiler_4.1.1  
## [45] jquerylib_0.1.4  rlang_1.0.2      grid_4.1.1       rstudioapi_0.13 
## [49] labeling_0.4.2   rmarkdown_2.14   gtable_0.3.0     abind_1.4-5     
## [53] DBI_1.1.3        R6_2.5.1         gridExtra_2.3    lubridate_1.8.0 
## [57] knitr_1.39       bit_4.0.4        fastmap_1.1.0    utf8_1.2.2      
## [61] stringi_1.7.6    Rcpp_1.0.8.3     vctrs_0.4.1      dbplyr_2.2.0    
## [65] tidyselect_1.1.2 xfun_0.31
 

By Jaime So for the Mugnier Lab

jso4@jhmi.edu